
global root_dir = "`1'"

include "$root_dir/code/config/config.do"


cap noi log using ${log_dir}/4_map_sic4_ind6090.log, replace name(dat)

*Handle empty arguments
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
	global wtype "$arg4"
}
di "${wtype}"
capture noi {


* 4_

/* This do-file matches the SIC industries to ind6090 

Using information from the pdf "ind90 - SIC.pdf", technical paper #65 from uscensusbureau (2003)
"The relationship between the 1990 census and census 2000 industry and occupation classification systems"

* Inputs:
- ind90_sic.dta, created by hand based on U.S. Census Bureau. 2003. "The Relationship Between The 1990 Census and Census 2000 Industry and Occupation Classification Systems." Technical Paper #65, U.S. Department of Commerce.
- ind90.dta 



Output:
- patents_ind6090
*/


* -----------------------------------
* A.Import and prep ind90 concordance
* -----------------------------------
*by hand, based on U.S. Census Bureau. 2003. "The Relationship Between The 1990 Census and Census 2000 Industry and Occupation Classification Systems." Technical Paper #65, U.S. Department of Commerce.
use ${alm_data_raw}/concordances/ind90_sic/ind90_sic.dta, clear
* by hand based on the following
* source https://www.census.gov/library/working-papers/2003/demo/tp-65.html
*link last checked 2024-08-04

drop SIC
drop if SIC4digit==""

* 7219 special case
expand 2 if SIC4digit=="7210 exc. part 7219"
sort SIC4digit
quietly by SIC4digit:  gen dup2 = cond(_N==1,0,_n) if SIC4digit=="7210 exc. part 7219" 
replace SIC4digit="7219" if dup2==2
drop dup
replace SIC4digit="7210 exc. 7219" if SIC4digit=="7210 exc. part 7219" 

*Generate weights - only in very few cases required - otherwise 1:1 mapping
*we split the sic code equally between the two industries
duplicates tag SIC, gen(dup)
replace dup=dup+1
gen weight=1/dup
drop dup Descr

gen temp=strlen(SIC4digit)
replace SIC4digit="0" + SIC4digit if temp==3
drop temp
rename SIC4digit SIC
label var weight "Weight for split SIC codes (1= no split)"
tempfile ind90_sic
save "`ind90_sic'", replace

* ------------------------------------
* B. Apply ind90 to sic's
* -------------------------------------

use sic_ind_flow using ${alm_data_proc}/ipc4_sic4_ind_using_concordance.dta, clear
duplicates drop
gen SIC=sic_ind_flow

*Aggregate some industries to match with concordance table
gen sic_2=substr(sic_ind_flow, 1,2)
replace sic_2=sic_2+"00"
foreach x in 0100 0200 0800 0900 1000 1200 1300 1400 1500 1600 1700 2100 2500 4000 4300 4400 4500 4600 4700  5800 6100 6200 6300 6400 6500 6700 8100 8400 8800 9200 9300 9400 9500 9600 9700 {
    replace SIC=sic_2 if sic_2=="`x'"
}
gen sic_3=substr(sic_ind_flow, 1,3)
replace sic_3=sic_3+"0"
foreach x in 0710 0720 0740 0750 0760 0780 2010 2020 2030 2040 2050 2060 2070 2080 2090 2210 2220 2230 2240 2250 2260 2270 2280 2290 2310 2320 2330 2340 2350 2360 2370 2380 2390 2410 2420 2430 2440 2450 2490 2610 2620 2630 2650 2670 2710 2720 2730 2740 2750 2760 2770 2780 2790 2810 2820 2830 2840 2850 2860 2870 2890 2910 2950 2990 3010 3020 3050 3060 3080 3110 3130 3140 3150 3160 3170 3190 3210 3220 3230 3240 3250 3260 3270 3280 3290 3310 3320 3340 3340 3390 3410 3420 3430 3440 3450 3460 3470 3480 3490 3510 3520 3530 3540 3550 3560 3580 3590 3610 3620 3630 3640 3650 3660 3670 3690 3710 3720 3730 3740 3750 3760 3790 3810 3840 3850 3860 3870 3940 4120 4210 4220 4230 4810 4820 4830 4840 4890 4910 4920 4930 4940 4950 4960 4970 5010 5020 5030 5040 5050 5060 5070 5080 5110 5120 5130 5140 5150 5160 5170 5180 5210 5230 5250 5260 5270 5310 5330 5390 5410 5420 5430 5440 5450 5460 5490 5510 5520 5530 5540 5550 5560 5570 5590 5660 5710 5720 5910 5920 5930 5980 6030 6060 7010 7020 7030 7040 7220 7230 7240 7250 7260 7290 7310 7320 7330 7340 7350 7360 7370 7510 7520 7530 7620 7630 7640 7810 7820 7830 7840 7910 7920 7930 7940 7990 8010 8020 8030 8060 8070 8080 8090 8210 8220 8230 8240 8290 8320 8330 8350 8350 8360 8390 8610 8620 8630 8640 8650 8660 8690 8710 8720 8730 8740 8990 9110 9120 9130 9190 {
    replace SIC=sic_3 if sic_3=="`x'"
}

*Special cases
replace SIC="3820 exc. 3827" if sic_3=="3820" & SIC!="3827"
replace SIC="3900 exc. 3940" if sic_2=="3900" & SIC!="3940"
replace SIC="4100 exc. 4120" if sic_2=="4100" & SIC!="4120"
replace SIC="5090 exc. 5093" if sic_3=="5090" & SIC!="5093"
replace SIC="5600 exc. 5660" if sic_2=="5600" & SIC!="5660"
replace SIC="6000 exc. 6030 and 6060" if sic_2=="6000" & SIC!="6030" & SIC!="6060"
replace SIC="7210 exc. 7219" if sic_3=="7210" & SIC!="7219"

* join patents with concordance table SIC to ind90
joinby SIC using "`ind90_sic'", unmatched(both)

*The only SIC of the master file which cannot be mapped is 3356—Rolling, Drawing, and Extruding of Nonferrous Metals, Except Copper and Aluminum
*Very likely that it belongs to ind90 280 - "Other primary metal industries"
*Assign it to this
replace ind90=280 if SIC=="3356"
replace weight=1 if SIC=="3356"
drop if _merge==2 
assert _merge==3 if SIC!="3356"
drop _merge

* --------------------------------
* c. Map into ind6090
* --------------------------------
*Autor, David H. "The Skill Content of Recent Technological Change: An Empirical Exploration" 
*Quarterly Journal of Economics, 118(4), November 2003, 1279-1334. Accessed November 2018. https://economics.mit.edu/people/faculty/david-h-autor/data-archive.
joinby ind90 using ${alm_data_raw}/concordances/ind90_sic/ind90.dta, unmatched(both)
drop ind7090 ind8090 dind8090 dind7090 mind8090 mind7090 
drop if _merge==2
assert _merge==3
drop _merge
ren sic_ind_flow sic4
duplicates drop

label var ind6090 "Consistent ALM industry code"
save ${alm_data_proc}/cw_sic4_ind6090.dta, replace

}
if _rc == 0 {
    display "Execution finished successfully."
}
else {
    display "Execution finished with errors."
}

cap log close dat